This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents.
# get data for hosp_5_years
outcome <- "hosp_5_year"
load(paste("data", outcome, "train_data.Rda", sep="/"))
load(paste("data", outcome, "test_data.Rda", sep="/"))
load(paste("data", outcome, "train_means.Rda", sep="/"))
load(paste("data", outcome, "train_sds.Rda", sep="/"))
# Get only required columns
preds_cts = c(
"decades_from_start", # This is for imputing and including time (hospital effects)
"age",
"age_2",
"height",
"weight",
"bmi",
"fev1",
"fvc",
"fivc",
"fev1_fvc",
"fev1_fev6",
"pef",
"mmef",
"dlco",
"spo2"
)
preds_bin = c("hosp_past_year", "sex") # Binary predictors
preds_categ = c("smoking_status", "mmrc") # Categorical
preds_inter = c("sex:age")
train_data <- train_data[c(preds_bin, preds_cts, preds_categ, outcome)]
train_data
## # A tibble: 4,544 × 20
## hosp_past_year sex decades_from_start age age_2 height weight bmi
## <lgl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 1 0.101 -0.0993 0.0179 0.777 0.166 -0.206
## 2 FALSE 0 0.254 0.484 0.425 -0.978 -1.01 -0.802
## 3 FALSE 1 0.456 -1.15 2.39 1.15 0.00241 -0.514
## 4 FALSE 1 -0.0506 0.0809 0.0119 -0.127 -0.161 -0.109
## 5 FALSE 1 -0.431 -0.260 0.123 0.245 0.0569 -0.0586
## 6 FALSE 1 0.633 0.372 0.252 -0.606 -0.325 -0.0489
## 7 FALSE 1 0.354 -0.199 0.0719 0.352 0.520 0.376
## 8 FALSE 1 0.786 -0.299 0.162 1.10 0.166 -0.346
## 9 FALSE 0 0.583 0.154 0.0429 -0.180 -0.842 -0.890
## 10 FALSE 1 -0.0248 -0.767 1.07 0.139 -0.188 -0.274
## # ℹ 4,534 more rows
## # ℹ 12 more variables: fev1 <dbl>, fvc <dbl>, fivc <dbl>, fev1_fvc <dbl>,
## # fev1_fev6 <dbl>, pef <dbl>, mmef <dbl>, dlco <dbl>, spo2 <dbl>,
## # smoking_status <fct>, mmrc <fct>, hosp_5_year <lgl>
library(mice)
nimp <- 10
pred_mat <- make.predictorMatrix(train_data)
# pred_mat[c("height", "weight"), c("bmi", "dlco")] <- 0
meth <- make.method(train_data)
imp <- futuremice(train_data, n.core=5,
predictorMatrix = pred_mat, method = meth,
m = nimp, maxit = 20, print = FALSE
)
imp$loggedEvents
## NULL
plot(imp)
Imputation here shows height, weight, and bmi not being properly mixed
together. The other variables are all sufficiently mixed.
# Don't apply passive imputation since BMI is no longer maintaining its ratio to weight and height
train_data %>%
select(weight, height, bmi) %>%
mutate(bmi_calc = weight / height^2)
## # A tibble: 4,544 × 4
## weight height bmi bmi_calc
## <dbl> <dbl> <dbl> <dbl>
## 1 0.166 0.777 -0.206 0.275
## 2 -1.01 -0.978 -0.802 -1.05
## 3 0.00241 1.15 -0.514 0.00183
## 4 -0.161 -0.127 -0.109 -9.97
## 5 0.0569 0.245 -0.0586 0.946
## 6 -0.325 -0.606 -0.0489 -0.884
## 7 0.520 0.352 0.376 4.21
## 8 0.166 1.10 -0.346 0.138
## 9 -0.842 -0.180 -0.890 -25.9
## 10 -0.188 0.139 -0.274 -9.77
## # ℹ 4,534 more rows
# However, there is a strong correlation
library(naniar)
library(ggplot2)
ggplot(train_data,
aes(x=weight,
y=bmi)) +
geom_miss_point(alpha=0.1)
ggplot(train_data,
aes(x=height,
y=bmi)) +
geom_miss_point(alpha=0.1)
ggplot(train_data,
aes(x=weight,
y=height)) +
geom_miss_point(alpha=0.1)
There is a strong correlation between the standardised weight and bmi.
Relations with height and bmi is hard to see (1/height^2 prop BMI). Weak
correlation between height and weight. Therefore we need to remove
feedback loop between weight and bmi. Also remove feedback between
height and bmi.
pred_mat[c("weight", "height"), "bmi"] <- 0
imp.pas <- futuremice(train_data, n.core=5,
predictorMatrix = pred_mat, method = meth,
m = 10, maxit = 20, print = FALSE
)
plot(imp.pas)
Weight, height and BMI now have better mixing in their chains. Check
quickly the other interactions.
ggplot(train_data,
aes(x=height,
y=age)) +
geom_miss_point(alpha=0.1)
ggplot(train_data,
aes(x=height,
y=fev1)) +
geom_miss_point(alpha=0.1)
ggplot(train_data,
aes(x=height,
y=fvc)) +
geom_miss_point(alpha=0.01)
ggplot(train_data,
aes(x=height,
y=fev1_fvc)) +
geom_miss_point(alpha=0.1)
ggplot(train_data,
aes(x=height,
y=fev1_fev6)) +
geom_miss_point(alpha=0.1)
ggplot(train_data,
aes(x=height,
y=pef)) +
geom_miss_point(alpha=0.1)
ggplot(train_data,
aes(x=height,
y=mmef)) +
geom_miss_point(alpha=0.1)
ggplot(train_data,
aes(x=fev1,
y=fvc)) +
geom_miss_point(alpha=0.01)
# Look at shadows in distributions
data_shadow <- bind_shadow(train_data)
data_shadow
## # A tibble: 4,544 × 40
## hosp_past_year sex decades_from_start age age_2 height weight bmi
## <lgl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 1 0.101 -0.0993 0.0179 0.777 0.166 -0.206
## 2 FALSE 0 0.254 0.484 0.425 -0.978 -1.01 -0.802
## 3 FALSE 1 0.456 -1.15 2.39 1.15 0.00241 -0.514
## 4 FALSE 1 -0.0506 0.0809 0.0119 -0.127 -0.161 -0.109
## 5 FALSE 1 -0.431 -0.260 0.123 0.245 0.0569 -0.0586
## 6 FALSE 1 0.633 0.372 0.252 -0.606 -0.325 -0.0489
## 7 FALSE 1 0.354 -0.199 0.0719 0.352 0.520 0.376
## 8 FALSE 1 0.786 -0.299 0.162 1.10 0.166 -0.346
## 9 FALSE 0 0.583 0.154 0.0429 -0.180 -0.842 -0.890
## 10 FALSE 1 -0.0248 -0.767 1.07 0.139 -0.188 -0.274
## # ℹ 4,534 more rows
## # ℹ 32 more variables: fev1 <dbl>, fvc <dbl>, fivc <dbl>, fev1_fvc <dbl>,
## # fev1_fev6 <dbl>, pef <dbl>, mmef <dbl>, dlco <dbl>, spo2 <dbl>,
## # smoking_status <fct>, mmrc <fct>, hosp_5_year <lgl>, hosp_past_year_NA <fct>,
## # sex_NA <fct>, decades_from_start_NA <fct>, age_NA <fct>, age_2_NA <fct>,
## # height_NA <fct>, weight_NA <fct>, bmi_NA <fct>, fev1_NA <fct>, fvc_NA <fct>,
## # fivc_NA <fct>, fev1_fvc_NA <fct>, fev1_fev6_NA <fct>, pef_NA <fct>, …
Examine is there a correlation between missing data and not taking certain measurements? Specifically, is age or breathlessness particularly correlated with missingness?
ggplot(data_shadow,
aes(x=age,
colour = mmrc_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=fev1_fvc,
colour = mmrc_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=age,
colour = smoking_status_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=fev1_fvc,
colour = smoking_status_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=age,
colour = dlco_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=fev1_fvc,
colour = dlco_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=age,
colour = fivc_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=fev1_fvc,
colour = fivc_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=age,
colour = spo2_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=fev1_fvc,
colour = spo2_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=decades_from_start,
colour = spo2_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=age,
colour = pef_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=fev1_fvc,
colour = pef_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=age,
colour = mmef_NA)) +
geom_density()
ggplot(data_shadow,
aes(x=fev1_fvc,
colour = mmef_NA)) +
geom_density()
For mmrc, smoking status, dlco, and fivc, we see a fairly unbiased
relationship between age and fev1_fvc (representative of disease
severity). However, the expriatory flow tests (PEF and MMEF) both have
different missing data distributions. PEF has a higher proportion of
missing values at mid-high FEV1/FVC values, and no missing at low
FEV1_FVC. Similarly, PEF has little missingness at older ages, but most
of its missing values are for younger patients. Inversely, MMEF is
mostly missing its values around slightly below average values for
FEV1_FVC, and around the mean age (whereas older or younger patients are
having their MMEF measured). HOWEVER, these are both because of their
incredibly small pct missingness (random correlations). This can be seen
through use of the histogram
ggplot(data_shadow,
aes(x=fev1_fvc,
fill = mmef_NA)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We now look at the density plots for the imputed chains.
densityplot(imp.pas)
PEF and MMEF both are noisier, coming from their imputed chains
consisting of a very small number of values from which to create a
density plot. SPo2 imputations are discrete and plotting their densities
is not a good visual representation. The other plots are all good,
however. FIVC and DLCO with the most missing values are very tight fits
to the original distributions.
We also can note that height, weight, bmi, fivc, pef, and dlco are all easily modelled with gaussians, so we can change the method.
meth_orig <- make.method(train_data)
meth_fast <- meth_orig
meth_fast[c("height", "weight", "bmi", "fivc", "pef", "dlco")] <- "norm"
start.time <- Sys.time()
futuremice(train_data, n.core=5,
predictorMatrix = pred_mat, method = meth_orig,
m = nimp, maxit = 20, print = FALSE
)
## Class: mids
## Number of multiple imputations: 10
## Imputation methods:
## hosp_past_year sex decades_from_start age
## "" "" "" ""
## age_2 height weight bmi
## "" "pmm" "pmm" "pmm"
## fev1 fvc fivc fev1_fvc
## "" "" "pmm" ""
## fev1_fev6 pef mmef dlco
## "pmm" "pmm" "pmm" "pmm"
## spo2 smoking_status mmrc hosp_5_year
## "pmm" "polyreg" "polyreg" ""
## PredictorMatrix:
## hosp_past_year sex decades_from_start age age_2 height weight
## hosp_past_year 0 1 1 1 1 1 1
## sex 1 0 1 1 1 1 1
## decades_from_start 1 1 0 1 1 1 1
## age 1 1 1 0 1 1 1
## age_2 1 1 1 1 0 1 1
## height 1 1 1 1 1 0 1
## bmi fev1 fvc fivc fev1_fvc fev1_fev6 pef mmef dlco spo2
## hosp_past_year 1 1 1 1 1 1 1 1 1 1
## sex 1 1 1 1 1 1 1 1 1 1
## decades_from_start 1 1 1 1 1 1 1 1 1 1
## age 1 1 1 1 1 1 1 1 1 1
## age_2 1 1 1 1 1 1 1 1 1 1
## height 0 1 1 1 1 1 1 1 1 1
## smoking_status mmrc hosp_5_year
## hosp_past_year 1 1 1
## sex 1 1 1
## decades_from_start 1 1 1
## age 1 1 1
## age_2 1 1 1
## height 1 1 1
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
## Time difference of 1.112067 mins
start.time <- Sys.time()
futuremice(train_data, n.core=5,
predictorMatrix = pred_mat, method = meth_fast,
m = nimp, maxit = 20, print = FALSE
)
## Class: mids
## Number of multiple imputations: 10
## Imputation methods:
## hosp_past_year sex decades_from_start age
## "" "" "" ""
## age_2 height weight bmi
## "" "norm" "norm" "norm"
## fev1 fvc fivc fev1_fvc
## "" "" "norm" ""
## fev1_fev6 pef mmef dlco
## "pmm" "norm" "pmm" "norm"
## spo2 smoking_status mmrc hosp_5_year
## "pmm" "polyreg" "polyreg" ""
## PredictorMatrix:
## hosp_past_year sex decades_from_start age age_2 height weight
## hosp_past_year 0 1 1 1 1 1 1
## sex 1 0 1 1 1 1 1
## decades_from_start 1 1 0 1 1 1 1
## age 1 1 1 0 1 1 1
## age_2 1 1 1 1 0 1 1
## height 1 1 1 1 1 0 1
## bmi fev1 fvc fivc fev1_fvc fev1_fev6 pef mmef dlco spo2
## hosp_past_year 1 1 1 1 1 1 1 1 1 1
## sex 1 1 1 1 1 1 1 1 1 1
## decades_from_start 1 1 1 1 1 1 1 1 1 1
## age 1 1 1 1 1 1 1 1 1 1
## age_2 1 1 1 1 1 1 1 1 1 1
## height 0 1 1 1 1 1 1 1 1 1
## smoking_status mmrc hosp_5_year
## hosp_past_year 1 1 1
## sex 1 1 1
## decades_from_start 1 1 1
## age 1 1 1
## age_2 1 1 1
## height 1 1 1
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
## Time difference of 1.098918 mins
There is no notable speedup, so continue to use pmm.
fit <- with(imp, glm(ici(imp) ~ age + height + weight + bmi + sex + fev1 + fvc + fivc + fev1_fvc + fev1_fev6 + pef + mmef + dlco + spo2 + smoking_status + mmrc, family=binomial))
ps <- rep(rowMeans(sapply(fit$analyses, fitted.values)), imp$m + 1)
# stripplot(imp.pas,data=mmef)
# Use some different methods of plotting spo2 to examine how well it has been imputed.
# SpO2 against probability of data being missing
xyplot(imp.pas, spo2 ~ ps | as.factor(.imp), alpha=0.05)
# SpO2 against severity
xyplot(imp.pas, spo2 ~ fev1_fvc | as.factor(.imp), alpha=0.05)
# Trying to increase distance spread of spo2
xyplot(imp.pas, age~spo2 | as.factor(.imp), na.groups=spo2, cex=c(0.5, 0.1), alpha=0.05)
xyplot(imp.pas, fev1_fvc~spo2 | as.factor(.imp), na.groups=spo2, cex=c(0.5, 0.2), alpha=c(0.05,0.05))
# Box and whiskers plot for each imputation
bwplot(imp.pas, data=spo2)
# Density plot again
densityplot(imp.pas, ~ spo2)
One of the biggest perceived issues is that spo2 has many missing values
that get imputed around the lower values for spo2. However, this is just
a visualisation issue where it’s hard to measure the relative densities.
80% of spo2 is missing, so it needs lots of imputations.
Overall, all imputations look appropriate.
imp.comp <- complete(imp.pas, "long")
imp.comp
## .imp .id hosp_past_year sex decades_from_start age age_2
## 1 1 1 FALSE 1 0.10103728 -0.099313624 1.790955e-02
## 2 1 2 FALSE 0 0.25352041 0.483518926 4.245158e-01
## 3 1 3 FALSE 1 0.45599801 -1.146111559 2.385178e+00
## 4 1 4 FALSE 1 -0.05061262 0.080904335 1.188531e-02
## 5 1 5 FALSE 1 -0.43140383 -0.260359460 1.230874e-01
## 6 1 6 FALSE 1 0.63264514 0.372320610 2.517104e-01
## 7 1 7 FALSE 1 0.35434259 -0.199008666 7.191358e-02
## 8 1 8 FALSE 1 0.78596152 -0.298703707 1.620124e-01
## 9 1 9 FALSE 0 0.58348391 0.153758404 4.292839e-02
## 10 1 10 FALSE 1 -0.02478214 -0.766503517 1.066830e+00
## 11 1 11 FALSE 1 0.05020957 -0.517265913 4.858415e-01
## 12 1 12 FALSE 1 -0.48306478 0.291797692 1.546075e-01
## 13 1 13 FALSE 0 0.81095875 0.303300966 1.670377e-01
## 14 1 14 FALSE 0 -0.63388143 -1.069423066 2.076663e+00
## 15 1 15 FALSE 1 0.43016754 -0.728159270 9.627637e-01
## 16 1 16 FALSE 1 -0.48306478 0.069401061 8.745792e-03
## 17 1 17 FALSE 0 -0.38057612 0.180599377 5.922423e-02
## 18 1 18 FALSE 1 -0.91218398 0.556372995 5.620811e-01
## 19 1 19 FALSE 0 -0.71053962 -0.789510065 1.131833e+00
## 20 1 20 FALSE 0 -0.38057612 -0.547941311 5.451738e-01
## 21 1 21 FALSE 0 -0.38057612 -0.490424941 4.367290e-01
## 22 1 22 FALSE 1 -0.63388143 -0.241187337 1.056272e-01
## 23 1 23 FALSE 1 -0.43140383 0.391492734 2.783007e-01
## 24 1 24 FALSE 1 -0.10144033 0.069401061 8.745792e-03
## 25 1 25 FALSE 1 -0.60805096 -0.356220077 2.304113e-01
## 26 1 26 TRUE 0 0.78596152 0.395327158 2.837790e-01
## 27 1 27 TRUE 0 0.05020957 -0.268028310 1.304452e-01
## 28 1 28 TRUE 0 0.37933983 0.211274774 8.105172e-02
## 29 1 29 TRUE 1 0.53265620 0.069401061 8.745792e-03
## 30 1 30 FALSE 0 -0.60805096 -0.517265913 4.858415e-01
## 31 1 31 FALSE 1 -0.02478214 0.495022200 4.449552e-01
## 32 1 32 FALSE 1 0.27935089 -0.513431489 4.786653e-01
## 33 1 33 FALSE 0 0.22852317 -0.080141501 1.166224e-02
## 34 1 34 FALSE 1 -0.48306478 0.330141939 1.979103e-01
## 35 1 35 FALSE 1 0.10103728 -0.797178914 1.153928e+00
## 36 1 36 FALSE 1 -0.50806202 0.138420705 3.479117e-02
## 37 1 37 TRUE 0 0.10103728 0.230446897 9.642922e-02
## 38 1 38 FALSE 1 -0.86302275 0.659902461 7.907267e-01
## 39 1 39 FALSE 0 0.02437909 -0.007287432 9.643084e-05
## 40 1 40 FALSE 1 -0.43140383 -0.041797254 3.172214e-03
## 41 1 41 FALSE 0 -0.45723431 0.287963268 1.505709e-01
## 42 1 42 FALSE 0 0.37933983 -0.682146174 8.449321e-01
## 43 1 43 TRUE 1 0.27935089 0.241950172 1.062965e-01
## 44 1 44 FALSE 0 -0.93801446 0.832451571 1.258302e+00
## 45 1 45 FALSE 1 0.20269270 0.636895913 7.365527e-01
## height weight bmi fev1 fvc fivc
## 1 0.77713507 0.165919925 -0.20645411 0.156279342 0.163720877 0.272825812
## 2 -0.97812521 -1.005879473 -0.80194940 -0.909744759 -0.915802485 -0.934576628
## 3 1.14946300 0.002413033 -0.51369338 2.007725875 1.673997111 1.705962180
## 4 -0.12708992 -0.161093860 -0.10928358 -0.319694518 0.090728194 -0.093431222
## 5 0.24523801 0.056915330 -0.05860811 -0.159725342 -0.399091125 -0.557356799
## 6 -0.60579727 -0.324600753 -0.04892677 -0.629143089 -0.528749180 -0.549882165
## 7 0.35161742 0.520184860 0.37593013 0.371975486 0.440804943 0.326144864
## 8 1.09627330 0.165919925 -0.34572878 0.763375479 0.793282767 0.576794235
## 9 -0.18027963 -0.842372581 -0.89040684 -0.103998375 -0.551799501 -0.197577780
## 10 0.13885860 -0.188345009 -0.27365567 -0.027291844 0.126744321 0.009718718
## 11 0.08566890 -0.351851902 -0.43100126 0.309036794 0.630009661 0.560350042
## 12 0.29842772 0.228597568 0.09730082 -0.286913949 0.109936795 0.185123448
## 13 -0.49941786 -1.005879473 -0.98127307 -0.081051977 -0.272794575 -0.413345529
## 14 0.13885860 -0.215596158 -0.30377607 -0.049582630 0.195415069 0.043105414
## 15 -0.02071051 -0.488107646 -0.53906847 0.232985874 0.451849889 0.335114424
## 16 0.67075566 0.165919925 -0.15688824 0.910232427 1.129433280 0.641076082
## 17 -0.49941786 -0.597112241 -0.45776172 0.183815020 -0.072544912 -0.144258729
## 18 -0.55260756 -0.444505808 -0.23466649 -0.581283458 -0.128249854 -0.230466167
## 19 -1.08450462 -0.242847307 0.37249022 -0.400334718 -0.375080374 -0.557855108
## 20 -0.18027963 0.629189455 0.85804109 -0.206929361 0.073920669 -0.312687134
## 21 -0.55260756 -0.515358795 -0.32657554 -0.430492841 -0.350109193 -0.484105392
## 22 -0.18027963 0.111417628 0.24284645 0.223151703 0.107535720 0.233957719
## 23 0.45799683 0.084166479 -0.13688102 -0.617342084 -0.063420827 -0.220499989
## 24 0.93670418 0.874449794 0.38285625 0.674212331 0.647297401 0.701869767
## 25 0.19204831 0.520184860 0.47520314 0.479495752 0.527243647 0.708846091
## 26 -0.23346933 -0.523534139 -0.48803949 -0.698637895 -0.502817569 -0.623631881
## 27 -0.87174580 -0.624363390 -0.30932492 0.182503798 -0.163305550 -0.124824682
## 28 -0.49941786 0.111417628 0.44965796 -0.170215124 -0.558042296 -0.578285772
## 29 0.77713507 -0.542609944 -0.88918625 -0.375421485 0.356287100 0.401389506
## 30 -0.07390022 0.029664181 0.08407622 0.196927248 -0.073985557 -0.205052413
## 31 -0.28665904 0.683691753 1.00617027 0.090062593 -0.207485332 -0.093929531
## 32 0.03247919 0.383929116 0.42564328 0.360174481 -0.007715884 0.206052421
## 33 -0.49941786 -0.569861092 -0.42286096 0.185126243 -0.144577165 -0.214520282
## 34 0.35161742 -0.079340414 -0.25643021 -0.454750462 -0.206524902 -0.517990396
## 35 0.61756595 1.146961283 0.84597829 1.469468933 1.129913495 1.200178657
## 36 -0.02071051 0.394829575 0.47242477 -0.499987647 0.202618294 0.081973507
## 37 -0.97812521 -0.678865688 -0.33065534 -0.536701885 -0.710750672 -0.768639768
## 38 -0.76536638 -0.016662772 0.46603235 -0.296748120 -0.395729620 -0.509020836
## 39 -0.18027963 0.193171074 0.33998244 0.225774148 -0.034607925 0.095427847
## 40 0.72394536 0.411180265 0.05708585 0.543745667 0.665065357 0.651540569
## 41 -0.44622815 -0.678865688 -0.58580432 -0.405579609 -0.113843404 -0.246910360
## 42 -0.28665904 -0.929576257 -0.95856530 -0.138090166 -0.146498025 -0.096421075
## 43 0.35161742 0.383929116 0.23221187 -0.338051637 0.554615903 0.625628506
## 44 -0.23346933 -0.351851902 -0.28152779 -0.534079439 -0.647842504 -0.868301546
## 45 0.19204831 -0.569861092 -0.71548026 -0.003034223 0.026859597 0.037624016
## fev1_fvc fev1_fev6 pef mmef dlco spo2
## 1 0.04720582 0.15506913 0.48237335 -0.1219518394 -0.002394092 0.3629031
## 2 -0.87274235 -1.18527843 -0.97589525 -0.5764671571 -0.708260510 0.1248577
## 3 0.58469069 0.52376884 1.27295837 2.3907947748 0.651794114 -0.5420941
## 4 -0.65270566 -0.55771594 -0.51054672 -0.4421785405 -0.266580839 0.1248577
## 5 0.37524963 0.42702641 0.47310610 -0.0823539140 -0.314878233 0.5641677
## 6 -0.58616610 -0.73844318 -0.59880628 -0.4766115191 -0.762112105 0.5641677
## 7 0.03833451 0.01767715 0.27628727 0.1672851809 0.430833535 -0.1664863
## 8 0.16611891 0.38539336 0.71912962 0.1913882659 0.557855682 -0.5420941
## 9 0.93044872 0.89767208 -0.17560168 0.5968365891 -0.271652065 0.1248577
## 10 -0.20513937 -0.27047002 0.52848897 -0.2063126370 0.412239038 0.3629031
## 11 -0.22943267 -0.37868056 -0.15530198 0.1431820959 0.371910714 -0.5420941
## 12 -0.61554005 -0.58719138 -0.12066010 -0.3690084610 -0.263200021 0.7385109
## 13 0.29223467 0.25842665 -0.07101410 -0.0005755898 -0.567715092 -0.1664863
## 14 -0.31654596 -0.46122847 -0.50017622 -0.1503590468 0.008472822 0.7385109
## 15 -0.16352702 -0.14858075 0.33851027 -0.0272611483 0.686085264 0.1248577
## 16 0.02662141 -0.01565843 0.64101991 0.4220892226 0.601806311 0.1248577
## 17 0.45271677 0.45681677 0.33056690 0.2946872018 -0.351825740 0.1248577
## 18 -0.93213525 -0.87163370 -0.62682869 -0.4912455351 -0.302803885 0.8922926
## 19 -0.24861884 -0.35813955 -0.34506004 -0.3827816524 0.125352516 -0.5420941
## 20 -0.44627565 -0.43562966 -0.38521814 -0.3423229026 0.085990140 0.1248577
## 21 -0.35868347 -0.35429129 -0.38168776 -0.4809156415 -0.275032883 0.5641677
## 22 0.23288579 0.17477904 -0.03593092 0.1793367234 0.085024192 -0.1664863
## 23 -1.05129166 -0.71181403 -0.63697854 -0.5988485932 -0.620359252 0.3629031
## 24 0.20882590 0.27916749 0.61476269 0.3213727602 0.499657322 -0.1664863
## 25 0.08896125 0.11835396 0.36565008 0.1845016702 0.258653324 -0.5420941
## 26 -0.81151316 -0.88082532 -0.71552955 -0.5394517051 -0.824174256 1.4691649
## 27 0.62299960 0.66580981 0.11587552 0.4995634245 -0.205484635 -0.5420941
## 28 0.76267115 0.73610660 0.16530088 0.4100376801 -0.222388723 0.7385109
## 29 -0.93814718 -0.97773626 -0.31615503 -0.4129105087 -0.543807882 0.1248577
## 30 0.47982064 0.45122546 0.52804767 0.2137697020 0.185482772 -0.5420941
## 31 0.52501436 0.54463991 0.67058686 0.2111872286 0.001711186 0.3629031
## 32 0.65738850 0.69075053 0.91219741 0.6708674931 0.016200405 -0.1664863
## 33 0.59061830 0.49589334 0.09425193 0.5400221744 -0.050691486 0.1248577
## 34 -0.60104609 -0.53939909 -0.33049721 -0.5170702690 -0.705121179 0.1248577
## 35 0.59272538 0.60669033 0.80871308 1.8725784466 1.194415339 -0.5420941
## 36 -1.02945024 -0.95056188 -0.42625883 -0.5076011999 -0.752935600 0.3629031
## 37 0.07477897 0.05913110 -0.52577149 -0.4060239130 -0.381045663 -0.1664863
## 38 0.03636683 0.04534656 -0.62682869 -0.2700136475 -0.260060691 -0.5420941
## 39 0.46319913 0.43527815 0.20038405 0.3153469889 -0.264890430 -0.1664863
## 40 0.02936738 0.22889585 0.66793908 -0.0470601110 0.806104289 -0.5420941
## 41 -0.60594590 -0.52222808 -0.61138326 -0.4542300830 -0.759938722 0.1248577
## 42 -0.04278908 -0.13548838 -0.06924891 -0.0487817599 -0.025335354 -0.1664863
## 43 -1.00061940 -0.93562076 -0.13279579 -0.4671424500 -0.311497416 0.3629031
## 44 -0.07729268 -0.04712651 -0.73009237 -0.3096115729 -0.917146740 -0.5420941
## 45 -0.03941545 -0.23448141 0.45148250 -0.1193693660 0.104584636 0.1248577
## smoking_status mmrc hosp_5_year
## 1 2 2 FALSE
## 2 0 1 TRUE
## 3 1 1 FALSE
## 4 1 1 FALSE
## 5 2 1 FALSE
## 6 1 2 FALSE
## 7 1 3 TRUE
## 8 1 0 FALSE
## 9 0 2 TRUE
## 10 1 1 TRUE
## 11 1 0 FALSE
## 12 1 3 TRUE
## 13 0 2 FALSE
## 14 1 3 FALSE
## 15 0 0 FALSE
## 16 2 1 FALSE
## 17 1 1 FALSE
## 18 1 2 TRUE
## 19 0 4 FALSE
## 20 1 1 FALSE
## 21 1 1 TRUE
## 22 1 2 FALSE
## 23 1 4 FALSE
## 24 1 1 FALSE
## 25 1 4 FALSE
## 26 2 2 TRUE
## 27 1 1 FALSE
## 28 1 3 TRUE
## 29 2 3 TRUE
## 30 0 0 FALSE
## 31 2 1 FALSE
## 32 2 0 TRUE
## 33 1 1 FALSE
## 34 1 3 TRUE
## 35 1 2 FALSE
## 36 1 0 FALSE
## 37 1 1 FALSE
## 38 1 4 FALSE
## 39 1 1 FALSE
## 40 1 0 FALSE
## 41 1 1 TRUE
## 42 1 3 FALSE
## 43 1 0 FALSE
## 44 1 1 FALSE
## 45 1 2 FALSE
## [ reached 'max' / getOption("max.print") -- omitted 45395 rows ]
#We can quickly fit models to this and assess normality of residuals for predictors
modelFit1 <- glm(data=imp.comp%>%filter(.imp==1), formula=hosp_5_year ~ hosp_past_year + sex + age + height + weight + bmi + fev1 + fvc + fev1_fvc + exp(spo2+101) + mmef + pef + dlco + mmrc + smoking_status, family="binomial")
summary(modelFit1)
##
## Call:
## glm(formula = hosp_5_year ~ hosp_past_year + sex + age + height +
## weight + bmi + fev1 + fvc + fev1_fvc + exp(spo2 + 101) +
## mmef + pef + dlco + mmrc + smoking_status, family = "binomial",
## data = imp.comp %>% filter(.imp == 1))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.250e+00 2.232e-01 -14.559 < 2e-16 ***
## hosp_past_yearTRUE 1.320e+00 1.283e-01 10.288 < 2e-16 ***
## sex1 1.770e-01 1.356e-01 1.305 0.19180
## age -9.468e-02 1.180e-01 -0.802 0.42249
## height 2.915e-01 4.547e-01 0.641 0.52145
## weight -4.497e-01 9.282e-01 -0.484 0.62805
## bmi 7.291e-01 7.911e-01 0.922 0.35675
## fev1 -5.861e-01 6.021e-01 -0.974 0.33030
## fvc -4.362e-01 4.112e-01 -1.061 0.28879
## fev1_fvc -8.905e-01 3.017e-01 -2.952 0.00316 **
## exp(spo2 + 101) 1.381e-45 9.033e-46 1.529 0.12620
## mmef 1.072e-03 2.601e-01 0.004 0.99671
## pef -5.288e-02 2.412e-01 -0.219 0.82643
## dlco -7.040e-01 1.671e-01 -4.213 2.52e-05 ***
## mmrc1 2.569e-01 1.570e-01 1.636 0.10187
## mmrc2 3.476e-01 1.665e-01 2.087 0.03688 *
## mmrc3 4.514e-01 1.682e-01 2.683 0.00729 **
## mmrc4 4.911e-01 1.742e-01 2.818 0.00483 **
## smoking_status1 4.521e-01 1.652e-01 2.737 0.00620 **
## smoking_status2 9.166e-01 1.823e-01 5.029 4.94e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3945.5 on 4543 degrees of freedom
## Residual deviance: 3236.7 on 4524 degrees of freedom
## AIC: 3276.7
##
## Number of Fisher Scoring iterations: 6
plot(modelFit1)
anova(modelFit1, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: hosp_5_year
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 4543 3945.5
## hosp_past_year 1 196.67 4542 3748.8 < 2.2e-16 ***
## sex 1 4.44 4541 3744.4 0.0350861 *
## age 1 20.99 4540 3723.4 4.607e-06 ***
## height 1 15.24 4539 3708.1 9.479e-05 ***
## weight 1 6.17 4538 3702.0 0.0130285 *
## bmi 1 1.69 4537 3700.3 0.1929467
## fev1 1 354.29 4536 3346.0 < 2.2e-16 ***
## fvc 1 16.78 4535 3329.2 4.188e-05 ***
## fev1_fvc 1 14.29 4534 3314.9 0.0001567 ***
## exp(spo2 + 101) 1 4.50 4533 3310.4 0.0339895 *
## mmef 1 0.00 4532 3310.4 0.9540787
## pef 1 1.05 4531 3309.4 0.3065100
## dlco 1 32.71 4530 3276.6 1.071e-08 ***
## mmrc 4 9.97 4526 3266.7 0.0409319 *
## smoking_status 2 30.00 4524 3236.7 3.061e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
res <- residuals(modelFit1, type="deviance")
plot(log(predict(modelFit1)), res)
## Warning in log(predict(modelFit1)): NaNs produced
abline(h=0, lty=2)
qqnorm(res)
qqline(res)
# Tasks to do
#
# Create and save imputed datasets
# Create model with no feature selection, run prediction on test set.
# Create model with feature selection, regularisation, etc. on test set.
# Create XGBoost model on test set.
# Compare performances for all
means[["spo2"]]
## [1] 1.513808